package max.nlp.wrappers.berkeley;


import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;

import max.nlp.dal.generic.GenericDB;
import max.nlp.dal.wiktionary.types.Text;
import max.nlp.dal.wiktionary.types.Translation;

import org.apache.log4j.Logger;


/**
 * Thin wrapper around the Berkeley unsupervised word aligner. It can
 * <ol>
 * <li>write an aligner configuration file ({@link #generateConfigFile()}),</li>
 * <li>run the aligner with that configuration
 * ({@link #generateWordAlignments()}),</li>
 * <li>read the resulting lexical-weight files back as {@link Translation}
 * objects ({@link #readTranslationsFromLexweightsFile(String, String)}), and</li>
 * <li>store those translations through {@link GenericDB}
 * ({@link #loadWordAlignmentsIntoMongo()}).</li>
 * </ol>
 * The no-argument constructor uses the historical hard-coded Windows paths;
 * use the five-argument constructor to point the wrapper somewhere else.
 * 
 * @author max.kaufmann
 */
public class BerkeleyAligner {

	/** Minimum translation probability for an entry to be kept. */
	private static final double THRESHOLD = .3;

	/** Sentinel meaning "no entropy value has been read for this word". */
	private static final double UNKNOWN_ENTROPY = -1;

	private static final String DEFAULT_CONFIG_FILE = "C:\\temp\\berk\\berk.conf";
	private static final String DEFAULT_EXEC_DIR = "C:\\temp\\berk\\output\\";
	private static final String DEFAULT_L1 = "e";
	private static final String DEFAULT_L2 = "f";
	private static final String DEFAULT_TRAIN_DIR = "C:\\Users\\max.kaufmann\\Downloads\\berkeleyaligner_unsupervised-2.1.tar\\berkeleyaligner_unsupervised-2.1\\berkeleyaligner\\example\\train\\";

	static Logger log = Logger.getLogger(BerkeleyAligner.class);

	/** Path of the aligner configuration file that is written and passed to the aligner. */
	private final String configFile;
	/** Directory written into the config as {@code execDir}; lexweights files are read from it. */
	private final String execDir;
	/** Language code written as {@code foreignSuffix}; source language of {@code 1.lexweights}. */
	private final String l1;
	/** Language code written as {@code englishSuffix}; source language of {@code 2.lexweights}. */
	private final String l2;
	/** Directory written into the config as {@code trainSources}. */
	private final String trainDir;

	/**
	 * Creates an aligner wrapper that uses the built-in default paths and
	 * the language codes {@code "e"} and {@code "f"}.
	 */
	public BerkeleyAligner() {
		this(DEFAULT_CONFIG_FILE, DEFAULT_EXEC_DIR, DEFAULT_TRAIN_DIR,
				DEFAULT_L1, DEFAULT_L2);
	}

	/**
	 * Creates an aligner wrapper with explicit locations and language codes.
	 * 
	 * @param configFile
	 *            path of the configuration file to write and hand to the
	 *            aligner
	 * @param execDir
	 *            value for the {@code execDir} option; also where the
	 *            lexweights files are looked up
	 * @param trainDir
	 *            value for the {@code trainSources} option
	 * @param l1
	 *            language code used for {@code foreignSuffix}
	 * @param l2
	 *            language code used for {@code englishSuffix}
	 */
	public BerkeleyAligner(String configFile, String execDir, String trainDir,
			String l1, String l2) {
		this.configFile = configFile;
		this.execDir = execDir;
		this.trainDir = trainDir;
		this.l1 = l1;
		this.l2 = l2;
	}

	/**
	 * Writes the aligner configuration to the configured config file,
	 * overwriting any existing file. Failures are logged, not thrown, so the
	 * method stays best-effort as before.
	 */
	public void generateConfigFile() {
		PrintWriter configOutput = null;
		try {
			configOutput = new PrintWriter(new FileWriter(new File(configFile)));
			configOutput.println("forwardModels	MODEL1 HMM");
			configOutput.println("reverseModels	MODEL1 HMM");
			configOutput.println("mode	JOINT JOINT");
			configOutput.println("iters	2 2");
			configOutput.println("execDir   " + execDir);
			configOutput.println("create    true");
			configOutput.println("overwriteExecDir  true");
			configOutput.println("saveParams        true");
			configOutput.println("numThreads        3");
			configOutput.println("testSources       " + "");

			configOutput.println("msPerLine 10000");
			configOutput.println("alignTraining");
			// configOutput.println("leaveTrainingOnDisk");
			// NOTE(review): l1 defaults to "e" but is written as the
			// *foreign* suffix (and l2 = "f" as the English one) - confirm
			// that this ordering is intended.
			configOutput.println("foreignSuffix	" + l1);
			configOutput.println("englishSuffix	" + l2);
			configOutput.println("lowercase");
			configOutput.println("trainSources      " + trainDir);
			configOutput.println("sentences MAX");
			configOutput.println("maxTestSentences  500");
			configOutput.println("offsetTestSentences       0");
			configOutput.println("saveLexicalWeights");
			configOutput.println("competitiveThresholding");
			configOutput.println("dictionary");
			// NOTE(review): "alignTraining" is already written above with
			// different casing; kept to preserve the generated file.
			configOutput.println("aligntraining");
			// Optional settings that are currently disabled:
			// configOutput.println("allowedprecisionconflicts 8");
			// configOutput.println("inferenceModeC     VITERBI_ITG");
			// configOutput.println("objMode    NONE");
			// configOutput.println("pruneThresh       1e-4");
			// configOutput.println("prunePredict      true");

			// PrintWriter never throws on write errors; checkError() flushes
			// the stream and reports whether any write failed.
			if (configOutput.checkError()) {
				log.error("Error while writing config file [" + configFile
						+ "]");
			}
		} catch (IOException e) {
			log.error("Can't write config file [" + configFile + "]", e);
		} finally {
			if (configOutput != null) {
				configOutput.close();
			}
		}
	}

	/**
	 * Runs the Berkeley aligner with the configured config file, generating
	 * that file first if it does not exist yet.
	 */
	public void generateWordAlignments() {
		if (!new File(configFile).exists()) {
			log.info("Generating config file at [" + configFile + "]");
			generateConfigFile();
		} else {
			log.info("Config file [" + configFile + "] already exists");
		}
		log.info("Training aligner ");
		// The aligner's entry point receives the config file path prefixed
		// with "++".
		String[] args = new String[] { "++" + configFile };
		edu.berkeley.nlp.wordAlignment.Main.main(args);
	}

	/**
	 * Reads all translations with probability of at least {@code THRESHOLD}
	 * from {@code 1.lexweights} and {@code 2.lexweights} in the exec directory
	 * and saves each of them through {@link GenericDB} (presumably backed by
	 * MongoDB, as the method name suggests). Assumes the exec directory holds
	 * results for exactly two languages.
	 */
	public void loadWordAlignmentsIntoMongo() {
		ArrayList<Translation> translations = new ArrayList<Translation>();
		translations.addAll(readTranslationsFromLexweightsFile(
				lexweightsPath("1.lexweights"), l1));
		translations.addAll(readTranslationsFromLexweightsFile(
				lexweightsPath("2.lexweights"), l2));

		GenericDB db = GenericDB.getInstance();
		for (Translation t : translations) {
			db.saveEntry(t);
		}
	}

	/**
	 * Parses a lexweights file into translations.
	 * <p>
	 * The format is inferred from the parsing logic (confirm against real
	 * aligner output): a header line containing {@code nTrans} carries the
	 * source word in its first tab-separated field and the entropy as the
	 * second space-separated token of its second field; every other line is
	 * {@code targetWord: probability}. Only entries whose probability is at
	 * least {@code THRESHOLD} are returned.
	 * <p>
	 * Malformed lines are logged and skipped. I/O problems are logged and the
	 * translations read so far (possibly none) are returned.
	 * 
	 * @param file
	 *            path of the lexweights file to read
	 * @param sourceLanguage
	 *            language code of the source words in {@code file}; the target
	 *            language is the other one of the two configured languages
	 * @return the translations at or above the threshold, never {@code null}
	 */
	public ArrayList<Translation> readTranslationsFromLexweightsFile(
			String file, String sourceLanguage) {
		ArrayList<Translation> translations = new ArrayList<Translation>();
		String targetLanguage = sourceLanguage.equals(l1) ? l2 : l1;

		BufferedReader wordFile = null;
		try {
			wordFile = new BufferedReader(new FileReader(file));
			String sourceWord = "";
			double entropy = UNKNOWN_ENTROPY;
			String line;
			while ((line = wordFile.readLine()) != null) {
				if (line.contains("nTrans")) {
					// Header line: starts the entries of a new source word.
					String[] header = line.split("\t");
					sourceWord = header[0];
					entropy = parseEntropy(header, line);
					continue;
				}

				// Split on the LAST colon so that target words which
				// themselves contain ':' are still parsed correctly.
				int separator = line.lastIndexOf(':');
				if (separator < 0) {
					if (line.trim().length() > 0) {
						log.warn("Skipping malformed line [" + line + "] in ["
								+ file + "]");
					}
					continue;
				}

				double translationProb;
				try {
					translationProb = Double.parseDouble(line.substring(
							separator + 1).trim());
				} catch (NumberFormatException e) {
					log.warn("Skipping line with unparseable probability ["
							+ line + "] in [" + file + "]", e);
					continue;
				}

				if (translationProb >= THRESHOLD) {
					// Trim so indentation around the word does not become
					// part of the stored token.
					String targetWord = line.substring(0, separator).trim();
					Text w1 = new Text(sourceWord, sourceLanguage);
					Text w2 = new Text(targetWord, targetLanguage);
					Translation t = new Translation(w1, w2);
					t.setProb(translationProb);
					if (entropy != UNKNOWN_ENTROPY) {
						t.setEntropy(entropy);
					}
					translations.add(t);
				}
			}
		} catch (FileNotFoundException e) {
			log.error("Lexweights file [" + file + "] not found", e);
		} catch (IOException e) {
			log.error("Error reading lexweights file [" + file + "]", e);
		} finally {
			if (wordFile != null) {
				try {
					wordFile.close();
				} catch (IOException e) {
					log.warn("Could not close lexweights file [" + file + "]",
							e);
				}
			}
		}
		return translations;
	}

	/**
	 * Extracts the entropy from the tab-split fields of a header line.
	 * Returns {@code UNKNOWN_ENTROPY} (and logs a warning) when the value is
	 * missing or not a number.
	 */
	private static double parseEntropy(String[] header, String line) {
		if (header.length > 1) {
			String[] tokens = header[1].split(" ");
			if (tokens.length > 1) {
				try {
					return Double.parseDouble(tokens[1]);
				} catch (NumberFormatException e) {
					// fall through to the warning below
				}
			}
		}
		log.warn("Could not parse entropy from header line [" + line + "]");
		return UNKNOWN_ENTROPY;
	}

	/**
	 * Builds the path of a file inside the exec directory, inserting a
	 * separator only when the configured directory does not already end with
	 * one (the default does, so its paths are unchanged).
	 */
	private String lexweightsPath(String fileName) {
		if (execDir.length() == 0 || execDir.endsWith("/")
				|| execDir.endsWith("\\")) {
			return execDir + fileName;
		}
		return execDir + File.separator + fileName;
	}

	/**
	 * Example usage: writes the config, runs the aligner, and stores the
	 * resulting translations.
	 * 
	 * @param args
	 *            ignored
	 */
	public static void main(String[] args) {
		BerkeleyAligner b = new BerkeleyAligner();

		// generate a config file and word alignments
		b.generateConfigFile();
		b.generateWordAlignments();

		// load them into the database. you can also just get them into
		// memory with readTranslationsFromLexweightsFile
		b.loadWordAlignmentsIntoMongo();
	}
}
